import pandas as pd
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.io as pio
import plotly.colors as colors
from datetime import datetime
data=pd.read_csv(r"C:\Users\vishal\Downloads\SuperStore_Sales_Dataset.csv",encoding='unicode_escape')
#it is showing how many rows and column we have in this dataset#
data.shape
data['Order Date'] = pd.to_datetime(data['Order Date'])
#with this function we will get first 10 rows#
data.head(10)
| Row ID+O6G3A1:R6 | Order ID | Order Date | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | City | ... | Category | Sub-Category | Product Name | Sales | Quantity | Profit | Returns | Payment Mode | ind1 | ind2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4918 | CA-2019-160304 | 2019-01-01 | 07-01-2019 | Standard Class | BM-11575 | Brendan Murry | Corporate | United States | Gaithersburg | ... | Furniture | Bookcases | Bush Westfield Collection Bookcases, Medium Ch... | 73.940 | 1 | 28.2668 | NaN | Online | NaN | NaN |
| 1 | 4919 | CA-2019-160304 | 2019-02-01 | 07-01-2019 | Standard Class | BM-11575 | Brendan Murry | Corporate | United States | Gaithersburg | ... | Furniture | Bookcases | Bush Westfield Collection Bookcases, Medium Ch... | 173.940 | 3 | 38.2668 | NaN | Online | NaN | NaN |
| 2 | 4920 | CA-2019-160304 | 2019-02-01 | 07-01-2019 | Standard Class | BM-11575 | Brendan Murry | Corporate | United States | Gaithersburg | ... | Technology | Phones | GE 30522EE2 | 231.980 | 2 | 67.2742 | NaN | Cards | NaN | NaN |
| 3 | 3074 | CA-2019-125206 | 2019-03-01 | 05-01-2019 | First Class | LR-16915 | Lena Radford | Consumer | United States | Los Angeles | ... | Office Supplies | Storage | Recycled Steel Personal File for Hanging File ... | 114.460 | 2 | 28.6150 | NaN | Online | NaN | NaN |
| 4 | 8604 | US-2019-116365 | 2019-03-01 | 08-01-2019 | Standard Class | CA-12310 | Christine Abelman | Corporate | United States | San Antonio | ... | Technology | Accessories | Imation Clip USB flash drive - 8 GB | 30.080 | 2 | -5.2640 | NaN | Online | NaN | NaN |
| 5 | 8605 | US-2019-116365 | 2019-03-01 | 08-01-2019 | Standard Class | CA-12310 | Christine Abelman | Corporate | United States | San Antonio | ... | Technology | Accessories | WD My Passport Ultra 1TB Portable External Har... | 165.600 | 3 | -6.2100 | NaN | Online | NaN | NaN |
| 6 | 8606 | US-2019-116365 | 2019-03-01 | 08-01-2019 | Standard Class | CA-12310 | Christine Abelman | Corporate | United States | San Antonio | ... | Technology | Phones | AT&T 17929 Lendline Telephone | 180.960 | 5 | 13.5720 | NaN | Cards | NaN | NaN |
| 7 | 9494 | CA-2019-105207 | 2019-03-01 | 08-01-2019 | Standard Class | BO-11350 | Bill Overfelt | Corporate | United States | Broken Arrow | ... | Furniture | Tables | Hon Practical Foundations 30 x 60 Training Tab... | 1592.850 | 7 | 350.4270 | NaN | COD | NaN | NaN |
| 8 | 9495 | CA-2019-105207 | 2019-03-01 | 08-01-2019 | Standard Class | BO-11350 | Bill Overfelt | Corporate | United States | Broken Arrow | ... | Office Supplies | Binders | Storex Dura Pro Binders | 11.880 | 2 | 5.3460 | NaN | COD | NaN | NaN |
| 9 | 2898 | US-2019-164630 | 2019-04-01 | 09-01-2019 | Standard Class | EB-13975 | Erica Bern | Corporate | United States | Charlotte | ... | Technology | Copiers | Hewlett Packard 310 Color Digital Copier | 959.968 | 4 | 119.9960 | NaN | Online | NaN | NaN |
10 rows × 23 columns
# here checking if any column value have null values#
data.isnull().sum()
# now i am getting summary if any data type is missing#
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5901 entries, 0 to 5900 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Row ID+O6G3A1:R6 5901 non-null int64 1 Order ID 5901 non-null object 2 Order Date 5901 non-null datetime64[ns] 3 Ship Date 5901 non-null object 4 Ship Mode 5901 non-null object 5 Customer ID 5901 non-null object 6 Customer Name 5901 non-null object 7 Segment 5901 non-null object 8 Country 5901 non-null object 9 City 5901 non-null object 10 State 5901 non-null object 11 Region 5901 non-null object 12 Product ID 5901 non-null object 13 Category 5901 non-null object 14 Sub-Category 5901 non-null object 15 Product Name 5901 non-null object 16 Sales 5901 non-null float64 17 Quantity 5901 non-null int64 18 Profit 5901 non-null float64 19 Returns 287 non-null float64 20 Payment Mode 5901 non-null object 21 ind1 0 non-null float64 22 ind2 0 non-null float64 dtypes: datetime64[ns](1), float64(5), int64(2), object(15) memory usage: 1.0+ MB
#here i am looking at descriptive statististic of dataset#
data.describe()
| Row ID+O6G3A1:R6 | Sales | Quantity | Profit | Returns | ind1 | ind2 | |
|---|---|---|---|---|---|---|---|
| count | 5901.000000 | 5901.000000 | 5901.000000 | 5901.000000 | 287.0 | 0.0 | 0.0 |
| mean | 5022.422471 | 265.345589 | 3.781901 | 29.700408 | 1.0 | NaN | NaN |
| std | 2877.977184 | 474.260645 | 2.212917 | 259.589138 | 0.0 | NaN | NaN |
| min | 1.000000 | 0.836000 | 1.000000 | -6599.978000 | 1.0 | NaN | NaN |
| 25% | 2486.000000 | 71.976000 | 2.000000 | 1.795500 | 1.0 | NaN | NaN |
| 50% | 5091.000000 | 128.648000 | 3.000000 | 8.502500 | 1.0 | NaN | NaN |
| 75% | 7456.000000 | 265.170000 | 5.000000 | 28.615000 | 1.0 | NaN | NaN |
| max | 9994.000000 | 9099.930000 | 14.000000 | 8399.976000 | 1.0 | NaN | NaN |
data.dtypes
Row ID+O6G3A1:R6 int64 Order ID object Order Date datetime64[ns] Ship Date object Ship Mode object Customer ID object Customer Name object Segment object Country object City object State object Region object Product ID object Category object Sub-Category object Product Name object Sales float64 Quantity int64 Profit float64 Returns float64 Payment Mode object ind1 float64 ind2 float64 dtype: object
#here we know that ind1 and ind2 column has no values so we are going to drop this column#
data.drop(['ind1','ind2'], axis=1, inplace=True)
data = data.rename(columns={'Order_ID':'Order_ID','Order Date': 'Order_Date','Ship Date': 'Ship_Date','Customer ID': 'Customer_ID','Customer Name': 'Customer_Name','Product ID': 'Product_ID','Product Name':'Product_Name','Payment Mode': 'Payment_Mode'})
data.columns = data.columns.str.strip()
data.dtypes
Row ID+O6G3A1:R6 int64 Order ID object Order_Date datetime64[ns] Ship_Date object Ship Mode object Customer_ID object Customer_Name object Segment object Country object City object State object Region object Product_ID object Category object Sub-Category object Product_Name object Sales float64 Quantity int64 Profit float64 Returns float64 Payment_Mode object dtype: object
print('order_date' in data.columns)
print(data.columns)
data.columns = data.columns.str.strip()
False
Index(['Row ID+O6G3A1:R6', 'Order ID', 'Order_Date', 'Ship_Date',
'Ship Mode', 'Customer_ID', 'Customer_Name', 'Segment', 'Country',
'City', 'State', 'Region', 'Product_ID', 'Category', 'Sub-Category',
'Product_Name', 'Sales', 'Quantity', 'Profit', 'Returns',
'Payment_Mode'],
dtype='object')
#Exploratory Data analysis#
#First we will do Overall Sales Analysis#
#Q. What is the total sales revenue?
Total_Revenue= data["Sales"].sum()
print("Total Sales Revenue; $", Total_Revenue)
Total Sales Revenue; $ 1565804.3232
print(data.columns)
Index(['Row ID+O6G3A1:R6', 'Order ID', 'Order_Date', 'Ship_Date',
'Ship Mode', 'Customer_ID', 'Customer_Name', 'Segment', 'Country',
'City', 'State', 'Region', 'Product_ID', 'Category', 'Sub-Category',
'Product_Name', 'Sales', 'Quantity', 'Profit', 'Returns',
'Payment_Mode'],
dtype='object')
#Top seling products by quantity and revenue#
top_products_quantity = data.groupby('Product_Name')['Quantity'].sum().sort_values(ascending=False)
# Group by product and calculate the total revenue generated
top_products_revenue = data.groupby('Product_Name')['Profit'].sum().sort_values(ascending=False)
# Print the top-selling products in terms of quantity and revenue
print("Top-selling products by quantity:")
print(top_products_quantity.head())
print("\nTop-selling products by revenue:")
print(top_products_revenue.head())
Top-selling products by quantity: Product_Name Staples 124 Easy-staple paper 89 Staple envelope 73 Staples in misc. colors 60 Chromcraft Round Conference Tables 59 Name: Quantity, dtype: int64 Top-selling products by revenue: Product_Name Canon imageCLASS 2200 Advanced Copier 25199.9280 Hewlett Packard LaserJet 3310 Copier 6407.8932 3D Systems Cube Printer, 2nd Generation, Magenta 3717.9714 HP Designjet T520 Inkjet Large Format Printer - 24" Color 2799.9840 GBC DocuBind TL300 Electric Binding System 2753.7593 Name: Profit, dtype: float64
#Top selling Products by category and sub-category
# Group by category and calculate the total revenue generated
top_categories_revenue = data.groupby('Category')['Profit'].sum().sort_values(ascending=False)
# Group by sub-category and calculate the total revenue generated
top_subcategories_revenue = data.groupby('Sub-Category')['Profit'].sum().sort_values(ascending=False)
# Print the top-selling categories and sub-categories by revenue
print("Top-selling categories by revenue:")
print(top_categories_revenue.head())
print("\nTop-selling sub-categories by revenue:")
print(top_subcategories_revenue.head())
Top-selling categories by revenue: Category Technology 90458.2486 Office Supplies 74797.2461 Furniture 10006.6112 Name: Profit, dtype: float64 Top-selling sub-categories by revenue: Sub-Category Copiers 42774.5828 Accessories 25336.6455 Phones 22308.9179 Paper 21112.3779 Binders 17885.3759 Name: Profit, dtype: float64
#Pie chart to know the sales by category
sales_by_category = data.groupby('Category')['Sales'].sum().reset_index()
category_pie= px.pie(sales_by_category,
values='Sales',
names='Category',
hole=0.5,
color_discrete_sequence=px.colors.qualitative.Pastel)
category_pie.update_traces(textposition='inside', textinfo='percent+label')
category_pie.update_layout(title_text='Sales Analysis by Category', title_font=dict(size=24))
category_pie.show()
#Sales by Sub-Category bar chart
sales_by_subcategory=data.groupby('Sub-Category')['Sales'].sum().reset_index()
sub_category_bar=px.bar(sales_by_subcategory,
x='Sub-Category',
y='Sales',
title='Sales Analysis by Sub-Category')
sub_category_bar.show()
# Please run this if you facing problem to see charts
data.reset_index(inplace=True)
#finding monthly sales trends
data['Order_Date'] = pd.to_datetime(data['Order_Date'])
data.set_index('Order_Date', inplace=True)
# Resample the data to monthly frequency and calculate the total quantities sold
monthly_sales = data['Quantity'].resample('M').sum()
# Plot the monthly sales trend
plt.figure(figsize=(8, 6))
plt.plot(monthly_sales)
plt.title('Monthly Sales Trend')
plt.xlabel('Order_Date')
plt.ylabel('Quantity Sold')
plt.show()
#top selling segment by quantity and revenue#
top_products_quantity = data.groupby('Segment')['Quantity'].sum().sort_values(ascending=False)
# Group by Segment and calculate the total revenue generated
top_products_revenue = data.groupby('Segment')['Profit'].sum().sort_values(ascending=False)
# Print the top-selling segments in terms of quantity and revenue
print("Top-selling products segment by quantity:")
print(top_products_quantity.head())
print("\nTop-selling products by revenue:")
print(top_products_revenue.head())
Top-selling products segment by quantity: Segment Consumer 11199 Corporate 6865 Home Office 4253 Name: Quantity, dtype: int64 Top-selling products by revenue: Segment Consumer 81338.5875 Corporate 57805.7991 Home Office 36117.7193 Name: Profit, dtype: float64
# Calculate sales and profit by segment
sales_profit_by_segment = data.groupby('Segment').sum().reset_index()
# Define color palette
color_palette = ['#FFA500', '#008000']
# Create a bar chart
segment_bar= go.Figure([
go.Bar(x=sales_profit_by_segment['Segment'], y=sales_profit_by_segment['Sales'], name='Sales', marker_color=color_palette[0]),
go.Bar(x=sales_profit_by_segment['Segment'], y=sales_profit_by_segment['Profit'], name='Profit', marker_color=color_palette[1])
])
# Update layout
segment_bar.update_layout(title='Sales and Profit Analysis by Customer Segment', xaxis_title='Customer Segment', yaxis_title='Amount')
# Show the chart
segment_bar.show()
#product return rate and return cost
return_rate = (data['Returns'].sum() / data['Profit'].sum()) * 100
return_cost = data['Returns'].sum()
print("Return rate: ", return_rate)
print("Return cost: ", return_cost)
Return rate: 0.163754736670661 Return cost: 287.0
data.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 5901 entries, 2019-01-01 to 2020-12-31 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 index 5901 non-null int64 1 Row ID+O6G3A1:R6 5901 non-null int64 2 Order ID 5901 non-null object 3 Ship_Date 5901 non-null object 4 Ship Mode 5901 non-null object 5 Customer_ID 5901 non-null object 6 Customer_Name 5901 non-null object 7 Segment 5901 non-null object 8 Country 5901 non-null object 9 City 5901 non-null object 10 State 5901 non-null object 11 Region 5901 non-null object 12 Product_ID 5901 non-null object 13 Category 5901 non-null object 14 Sub-Category 5901 non-null object 15 Product_Name 5901 non-null object 16 Sales 5901 non-null float64 17 Quantity 5901 non-null int64 18 Profit 5901 non-null float64 19 Returns 287 non-null float64 20 Payment_Mode 5901 non-null object dtypes: float64(3), int64(3), object(15) memory usage: 1014.2+ KB